require(quanteda)
require(stm)
require(mvtnorm)
require(dichromat)
Paired <- colorschemes$Categorical.12

#### covert the data to stm format ####
load("dfm_matrix.Rdata")

## extract the data from 2003 to 2009 (due to the availability of variables to control district urbanness)
dfm.matrix.after.2003 <- dfm_subset(dfm.matrix, year >= 2003)
dfm.matrix.after.2003 <- dfm_trim(dfm.matrix.after.2003, 
                                  min_termfreq = 1, 
                                  termfreq_type = "count")  # delete unused terms
dim(dfm.matrix.after.2003)  # number of successful candidates
table(docvars(dfm.matrix.after.2003, "female"))  # number of successful women candidates

stm.data.after.2003 <- convert(dfm.matrix.after.2003, to = "stm")
stm.data.after.2003$meta$year <- factor(stm.data.after.2003$meta$year)
stm.data.after.2003$meta$party <- factor(stm.data.after.2003$meta$party, 
                                         levels = c("LDP", "JSP", "Komeito", "JCP", "DPJ", "right", "others"))

#### specify the number of topics ####
## from 10- to 90-topic models
number.of.topics.after.2003.1 <- seq(10, 90, 10)

for (i in 1:length(number.of.topics.after.2003.1)) {
  set.seed(number.of.topics.after.2003.1[i])
  STM.result.after.2003 <- stm(documents = stm.data.after.2003$documents, 
                               vocab = stm.data.after.2003$vocab, 
                               K = number.of.topics.after.2003.1[i], 
                               prevalence = ~ female + year + party + age + I(age ^ 2) + incumbent + wins + DID + over65 + agriculture, 
                               data = stm.data.after.2003$meta)
  save(STM.result.after.2003, file = paste0("STM_result_after_2003_", number.of.topics.after.2003.1[i], ".Rdata"))
}

exclusivity.res.after.2003.1 <- SC.res.after.2003.1 <- rep(NA, length(number.of.topics.after.2003.1))
for (i in 1:length(number.of.topics.after.2003.1)) {
  load(paste0("STM_result_after_2003_", number.of.topics.after.2003.1[i], ".Rdata"))
  exclusivity.res.after.2003.1[i] <- mean(exclusivity(STM.result.after.2003))
  SC.res.after.2003.1[i] <- mean(semanticCoherence(STM.result.after.2003, stm.data.after.2003$documents))
}

# Figure A.11
cairo_pdf("Figure_A11.pdf", 
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2), lwd = 0.5)
plot(c(1:length(number.of.topics.after.2003.1)), SC.res.after.2003.1, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.after.2003.1), 2), 
     labels = number.of.topics.after.2003.1[seq(1, length(number.of.topics.after.2003.1), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
plot(c(1:length(number.of.topics.after.2003.1)), exclusivity.res.after.2003.1, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.after.2003.1), 2), 
     labels = number.of.topics.after.2003.1[seq(1, length(number.of.topics.after.2003.1), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
dev.off()

## from 40- to 60-topic models
number.of.topics.after.2003.2 <- c(41:49, 51:59)

for (i in 1:length(number.of.topics.after.2003.2)) {
  set.seed(number.of.topics.after.2003.2[i])
  STM.result.after.2003 <- stm(documents = stm.data.after.2003$documents, 
                               vocab = stm.data.after.2003$vocab, 
                               K = number.of.topics.after.2003.2[i], 
                               prevalence = ~ female + year + party + age + I(age ^ 2) + incumbent + wins + DID + over65 + agriculture, 
                               data = stm.data.after.2003$meta)
  save(STM.result.after.2003, file = paste0("STM_result_after_2003_", number.of.topics.after.2003.2[i], ".Rdata"))
}

number.of.topics.after.2003.3 <- 40:60

exclusivity.res.after.2003.2 <- SC.res.after.2003.2 <- rep(NA, length(number.of.topics.after.2003.3))
for (i in 1:length(number.of.topics.after.2003.3)) {
  load(paste0("STM_result_after_2003_", number.of.topics.after.2003.3[i], ".Rdata"))
  exclusivity.res.after.2003.2[i] <- mean(exclusivity(STM.result.after.2003))
  SC.res.after.2003.2[i] <- mean(semanticCoherence(STM.result.after.2003, stm.data.after.2003$documents))
}

# Figure A.12
cairo_pdf("Figure_A12.pdf", 
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2), lwd = 0.5)
plot(c(1:length(number.of.topics.after.2003.3)), SC.res.after.2003.2, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.after.2003.3), 2), 
     labels = number.of.topics.after.2003.3[seq(1, length(number.of.topics.after.2003.3), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
plot(c(1:length(number.of.topics.after.2003.3)), exclusivity.res.after.2003.2, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", 
     xaxt = "n", yaxt = "n")
axis(1, at = seq(1, length(number.of.topics.after.2003.3), 2), 
     labels = number.of.topics.after.2003.3[seq(1, length(number.of.topics.after.2003.3), 2)], lwd = 0.5)
axis(2, lwd = 0.5)
dev.off()

#### list of topics (Online Appendix F.2) ####
load("STM_result_after_2003_48.Rdata")

topic.order.after.2003 <- order(colMeans(STM.result.after.2003$theta), decreasing = TRUE)
data.frame(labelTopics(STM.result.after.2003, n = 5)$prob, 
           prob = round(colMeans(STM.result.after.2003$theta), 3) * 100)[topic.order.after.2003, ]

topic.labels.after.2003 <- c("Postal privatization", "Administrative efficiency", "Candidate information", "JCP's economic policy", "Small and medium-sized enterprises", 
                             "Anti-Koizumi government", "Child care", "Anti-neoliberal policy", "Agriculture and fisheries", "Appeals for change", 
                             "Energy policy", "Tax increase", "North Korea", "Pacifism", "Health inequality", 
                             "Public pension", "Community development", "DPJ manifesto (2005)", "Criticism of the government (2003)", "Local administration reform", 
                             "Doshu system", "Structural reform", "Elderly care", "Welfare", "Economic measure", 
                             "Criticism of the government (2009)", "Highway", "SDP manifesto (2009)", "Postal savings", "Traffic network development", 
                             "JCP manifesto (2005)", "Anti-corruption policy", "Appeals for hearing voters' demands", "JCP manifesto (2003)", "Agricultural administration", 
                             "Okinawan politics", "Criticism of the government (2005)", "Urban development", "Support for rural areas", "DPJ manifesto (2009)", 
                             "Regime change", "International issues", "Community education", "Legislative records", "Budget", 
                             "City declaration", "Social issues", "Appeals for responsibility")
colnames(STM.result.after.2003$theta) <- topic.labels.after.2003

#### post-estimation simulation ####
## conduct the post-estimation simulation
set.seed(12345)
post.sim.after.2003 <- estimateEffect(formula(1:48 ~ female + year + party + age + I(age ^ 2) + incumbent + wins + DID + over65 + agriculture), 
                                      STM.result.after.2003, stm.data.after.2003$meta, nsims = 100)
summary.post.sim.after.2003 <- summary(post.sim.after.2003)

## which topics show a large gender difference
gender.diff.after.2003 <- matrix(NA, 48, 4)
for (i in 1:48) {
  gender.diff.after.2003[i, ] <- summary.post.sim.after.2003$tables[[i]][2, ]  # the second row corresponds to the coefficient of the female dummy
}
rownames(gender.diff.after.2003) <- topic.labels.after.2003
significant.topics.after.2003 <- p.adjust(gender.diff.after.2003[, 4], "holm") < 0.05
round(gender.diff.after.2003[significant.topics.after.2003, ], 3)  # topics whose gender difference is significant with the Holm correction
topic.labels.after.2003[significant.topics.after.2003]
gender.diff.after.2003.order <- order(gender.diff.after.2003[, 1], decreasing = TRUE)

# top 10 most frequent words in "welfare" (Online Appendix footnote 5)
labelTopics(STM.result.after.2003, n = 10)$prob[24, ]

# top 10 most frequent words in "community education" (Online Appendix footnote 6)
labelTopics(STM.result.after.2003, n = 10)$prob[43, ]

# top 10 most frequent words in "pacifism" (Online Appendix footnote 6)
labelTopics(STM.result.after.2003, n = 10)$prob[14, ]

# top 10 most frequent words in "child care" (Online Appendix footnote 6)
labelTopics(STM.result.after.2003, n = 10)$prob[7, ]

#### compute the first-differences (FDs) ####
## function implementing the procedure explained in Online Appendix A.3
predict.fun.model1 <- function(result, covname, cov.value1, cov.value2, sim.beta) {
  cdata <- result$data
  cdata[, covname] <- cov.value1
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.1 <- tcrossprod(design.matrix, sim.beta)
  cdata[, covname] <- cov.value2
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  sim.result.2 <- tcrossprod(design.matrix, sim.beta)
  dif.sim.result <- sim.result.1 - sim.result.2
  dif.sim.mean <- mean(dif.sim.result)
  dif.sim.CI <- quantile(colMeans(dif.sim.result), c(0.025, 0.975))
  c(dif.sim.mean,  # point estimate of FD
    dif.sim.CI,  # confidence interval
    mean(sim.result.1),  # expected prevalence when covname takes cov.value1
    mean(sim.result.2))  # expected prevalence when covname takes cov.value2
}

## compute the FDs for gender
female.sim.result.after.2003 <- matrix(NA, 48, 5)
for (i in 1:48) {
  set.seed(12345)
  sim.beta <- do.call(rbind, 
                      lapply(post.sim.after.2003$parameters[[i]], 
                             function(x) rmvnorm(100, x$est, x$vcov)))
  female.sim.result.after.2003[i, ] <- predict.fun.model1(post.sim.after.2003, "female", 1, 0, sim.beta)
}

## how large the differences in female-stereotypic topics are
rownames(female.sim.result.after.2003) <- topic.labels.after.2003
round(female.sim.result.after.2003[c(24, 43, 14, 7), ], 3)
round(female.sim.result.after.2003[c(24, 43, 14, 7), 4] / 
        female.sim.result.after.2003[c(24, 43, 14, 7), 5], 2)

# Figure A.13
cairo_pdf("Figure_A13.pdf", 
          width = 6, height = 5, pointsize = 7, family = "Helvetica")
par(mar = c(4, 1, 1, 1), lwd = 0.5)
plot(NULL, NULL, type = "n", bty = "n", 
     xlim = c(-0.055, 0.042), ylim = c(1, 48), 
     xlab = "", ylab = "", xaxt = "n", yaxt = "n")
abline(v = seq(-0.03, 0.04, 0.01), col = "gray")
segments(rep(-0.032, 48), 1:48, 
         rep(0.042, 48), 1:48, lty = 3, col = "gray")
points(female.sim.result.after.2003[gender.diff.after.2003.order, 1], 48:1, 
       pch = ifelse(significant.topics.after.2003[gender.diff.after.2003.order], 15, 19), 
       col = ifelse(significant.topics.after.2003[gender.diff.after.2003.order], Paired[12], "black"))
segments(female.sim.result.after.2003[gender.diff.after.2003.order, 2], 48:1, 
         female.sim.result.after.2003[gender.diff.after.2003.order, 3], 48:1, 
         lwd = 0.5, col = ifelse(significant.topics.after.2003[gender.diff.after.2003.order], Paired[12], "black"))
text(rep(-0.032, 48), 48:1, 
     labels = topic.labels.after.2003[gender.diff.after.2003.order], pos = 2, cex = 0.8)
axis(1, at = c(seq(-0.03, 0.04, 0.01)), lwd = 0.5)
mtext(c("more prevalent\namong men", "more prevalent\namong women"), 
      side = 1, at = c(-0.03, 0.04), line = 3)
dev.off()